1 .- Introducción

https://www.business-science.io/learn-r/2020/04/20/setup-python-in-r-with-rmarkdown.html

CONCURSO DE ANÁLISIS DE DATOS CON MOTIVO DE LOS 200 AÑOS DEL NACIMIENTO DE FLORENCE NIGHTINGALE

Con el objetivo de conmemorar los 200 años del nacimiento de Florence Nightingale, precursora de la estadística y epidemiología moderna, y primera mujer admitida en la Royal Statistical Society, el colectivo de Rladies España (nodos de Barcelona, Madrid y Bilbao) en colaboración con la Sociedad Española de Biometría y la Sociedad Catalana de Estadística se complace en anunciar el concurso: “200 AÑOS DE FLORENCE NIGHTINGALE”.

El objetivo del concurso consiste en analizar el conjunto de datos que Florence Nightingale utilizó para analizar las causas de mortalidad del ejército británico durante la guerra de Crimea y que sirvió para determinar los factores asociados a la alta mortalidad y reducirla significativamente.

https://github.com/rladies/spain_nightingale/blob/master/README.md

https://github.com/rladies/spain_nightingale/blob/master/datos_florence.xlsx

2 .- Limpieza y preparación entorno trabajo

# Limpiamos el entorno de Trabajo
rm(list=ls())

# Limpiamos la consola
cat("\014")
# Comprobamos que está bien establecido el directorio
getwd()
## [1] "/home/oscar/Documentos/R/Github/florence"
dir()
## [1] "florence.html" "florence.Rmd"  "header.html"
#indicamos el directorio de trabajo
setwd("~/Documentos/R/Github/florence")

2.1 .- Carga de librerias

# Importamos las librerias a utilizar

packages <- c( "reshape2","rio","csvy", "feather", "fst", "hexView", "readODS", "rmatio", "magrittr","dplyr", "ggfortify", "zoo","GGally","ggplot2","forecast","tidyquant","dygraphs","psych")
newpack  = packages[!(packages %in% installed.packages()[,"Package"])]

if(length(newpack)) install.packages(newpack)
a=lapply(packages, library, character.only=TRUE)

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

cat("\nCabecera y primesas líneas: \n")
## 
## Cabecera y primesas líneas:
head(raw)
##       ...1                 ...2           Deaths              ...4
## 1    Month Average size of army Zymotic diseases Wounds & injuries
## 2 Apr 1854                 8571                1                 0
## 3 May 1854                23333               12                 0
## 4 Jun 1854                28333               11                 0
## 5 Jul 1854                28722              359                 0
## 6 Aug 1854                30246              828                 1
##               ...5 Annual rate of mortality per 1000              ...7
## 1 All other causes                  Zymotic diseases Wounds & injuries
## 2                5                               1.4                 0
## 3                9                               6.2                 0
## 4                6                               4.7                 0
## 5               23                               150                 0
## 6               30                             328.5               0.4
##                 ...8
## 1   All other causes
## 2                  7
## 3 4.5999999999999996
## 4                2.5
## 5                9.6
## 6               11.9
cat("\nNombre de  las columnas: v")
## 
## Nombre de  las columnas: v
names(raw)
## [1] "...1"                              "...2"                             
## [3] "Deaths"                            "...4"                             
## [5] "...5"                              "Annual rate of mortality per 1000"
## [7] "...7"                              "...8"
cat("\nClase: \n")
## 
## Clase:
class(raw)
## [1] "data.frame"
cat("\nDimensión: \n")
## 
## Dimensión:
dim(raw)
## [1] 25  8
names(raw) <- c("mes-año", "tamaño","Muertes x Zymotic", "Muertes x Heridas/Lesiones", "Otras Causas", "Tasa anual x1000 Zymotic", "Tasa anual x1000 Heridas/Lesiones","Tasa anual x1000 otras")
d <- raw[c(-1,-2,-3),]
d$`mes-año`[d$`mes-año`=="Aug_1855"]="Aug 1855"
as.Date(d$`mes-año`, "%m %Y")
##  [1] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
# simplificamos nombres
names(d)<- c("fecha", "tamaño","zymotic", "heridas", "otras", "zymotic1000año", "heridas1000año","otras1000año")
str(d)
## 'data.frame':    22 obs. of  8 variables:
##  $ fecha         : chr  "Jun 1854" "Jul 1854" "Aug 1854" "Sep 1854" ...
##  $ tamaño        : chr  "28333" "28722" "30246" "30290" ...
##  $ zymotic       : chr  "11" "359" "828" "788" ...
##  $ heridas       : chr  "0" "0" "1" "81" ...
##  $ otras         : chr  "6" "23" "30" "70" ...
##  $ zymotic1000año: chr  "4.7" "150" "328.5" "312.2" ...
##  $ heridas1000año: chr  "0" "0" "0.4" "32.1" ...
##  $ otras1000año  : chr  "2.5" "9.6" "11.9" "27.7" ...
# Convertimos en numeric
atr <- c("tamaño","zymotic", "heridas", "otras", "zymotic1000año", "heridas1000año","otras1000año")
for (i in atr){
  d[,i]<- as.numeric(d[,i])
}
# Redondeamos los últimos números
cols <- names(d[6:8])
for (i in cols){
  d[,i] <- round(d[,i], digits = 2)
}
glimpse(d)
## Rows: 22
## Columns: 8
## $ fecha          <chr> "Jun 1854", "Jul 1854", "Aug 1854", "Sep 1854", "Oct 1…
## $ tamaño         <dbl> 28333, 28722, 30246, 30290, 30643, 29736, 32779, 32393…
## $ zymotic        <dbl> 11, 359, 828, 788, 503, 844, 1725, 2761, 2120, 1205, 4…
## $ heridas        <dbl> 0, 0, 1, 81, 132, 287, 114, 83, 42, 32, 48, 49, 209, 1…
## $ otras          <dbl> 6, 23, 30, 70, 128, 106, 131, 324, 361, 172, 57, 37, 3…
## $ zymotic1000año <dbl> 4.7, 150.0, 328.5, 312.2, 197.0, 340.6, 631.5, 1022.8,…
## $ heridas1000año <dbl> 0.0, 0.0, 0.4, 32.1, 51.7, 115.8, 41.7, 30.7, 16.3, 12…
## $ otras1000año   <dbl> 2.5, 9.6, 11.9, 27.7, 50.1, 42.8, 48.0, 120.0, 140.1, …

2.2 .-ETL

# convierto la muertes mensuales por 1000
col <- names(d[3:5])
for (i in col){
  d[,i]<- d[,i]/d$tamaño*1000
  d[,i]
}
df <- dplyr::select(d, -tamaño)
names(df) <- c("fecha","zymotic", "heridas", "otras", "zymoticAÑO", "heridasAÑO","otrasAÑO")
df$fecha <- gsub("_", " ", df$fecha, fixed = TRUE)
# Separar fecha
df <- df %>%
  tidyr::separate(fecha, c("mes", "año"), sep = " ")

2.3 .-Resumen de datos

summary(df)
##      mes                año               zymotic           heridas      
##  Length:22          Length:22          Min.   : 0.3251   Min.   :0.0000  
##  Class :character   Class :character   1st Qu.: 3.0385   1st Qu.:0.1381  
##  Mode  :character   Mode  :character   Median :13.4099   Median :1.3699  
##                                        Mean   :20.1083   Mean   :2.2017  
##                                        3rd Qu.:27.0354   3rd Qu.:3.3939  
##                                        Max.   :85.2345   Max.   :9.6516  
##      otras           zymoticAÑO        heridasAÑO         otrasAÑO      
##  Min.   : 0.2118   Min.   :   3.90   Min.   :  0.000   Min.   :  2.500  
##  1st Qu.: 0.6756   1st Qu.:  36.48   1st Qu.:  1.625   1st Qu.:  8.125  
##  Median : 0.9186   Median : 160.90   Median : 16.450   Median : 11.000  
##  Mean   : 2.4075   Mean   : 241.30   Mean   : 26.423   Mean   : 28.882  
##  3rd Qu.: 3.2513   3rd Qu.: 324.43   3rd Qu.: 40.700   3rd Qu.: 39.025  
##  Max.   :11.6757   Max.   :1022.80   Max.   :115.800   Max.   :140.100
skimr::skim(df)
Data summary
Name df
Number of rows 22
Number of columns 8
_______________________
Column type frequency:
character 2
numeric 6
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
mes 0 1 3 3 0 12 0
año 0 1 4 4 0 3 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
zymotic 0 1 20.11 23.09 0.33 3.04 13.41 27.04 85.23 ▇▂▁▁▁
heridas 0 1 2.20 2.45 0.00 0.14 1.37 3.39 9.65 ▇▃▂▁▁
otras 0 1 2.41 3.12 0.21 0.68 0.92 3.25 11.68 ▇▂▁▁▁
zymoticAÑO 0 1 241.30 277.10 3.90 36.47 160.90 324.42 1022.80 ▇▂▁▁▁
heridasAÑO 0 1 26.42 29.38 0.00 1.62 16.45 40.70 115.80 ▇▃▂▁▁
otrasAÑO 0 1 28.88 37.39 2.50 8.12 11.00 39.02 140.10 ▇▂▁▁▁

2.4 .-Boxplot

boxplot(df[3:8])

#library(epiDisplay)
epiDisplay::codebook(df)
## 
##  
##  
## mes   :    
## A character vector 
## 
##  ================== 
## año   :    
## A character vector 
## 
##  ================== 
## zymotic   :    
##  obs. mean   median  s.d.   min.   max.  
##  22   20.108 13.41   23.092 0.325  85.234
## 
##  ================== 
## heridas   :    
##  obs. mean   median  s.d.   min.   max.  
##  22   2.202  1.37    2.449  0      9.652 
## 
##  ================== 
## otras     :    
##  obs. mean   median  s.d.   min.   max.  
##  22   2.407  0.919   3.115  0.212  11.676
## 
##  ================== 
## zymoticAÑO    :    
##  obs. mean   median  s.d.    min.   max.  
##  22   241.3  160.9   277.098 3.9    1022.8
## 
##  ================== 
## heridasAÑO    :    
##  obs. mean   median  s.d.   min.   max.  
##  22   26.423 16.45   29.377 0      115.8 
## 
##  ================== 
## otrasAÑO      :    
##  obs. mean   median  s.d.   min.   max.  
##  22   28.882 11      37.386 2.5    140.1 
## 
##  ==================
epiDisplay::summ(df)
## 
## No. of observations = 22
## 
##   Var. name  obs. mean   median  s.d.   min.   max.  
## 1 mes                                                
## 2 año                                                
## 3 zymotic    22   20.11  13.41   23.09  0.33   85.23 
## 4 heridas    22   2.2    1.37    2.45   0      9.65  
## 5 otras      22   2.41   0.92    3.12   0.21   11.68 
## 6 zymoticAÑO 22   241.3  160.9   277.1  3.9    1022.8
## 7 heridasAÑO 22   26.42  16.45   29.38  0      115.8 
## 8 otrasAÑO   22   28.88  11      37.39  2.5    140.1
visdat::vis_dat(df)

2.5 .-Matrices

#library(psych)
pairs.panels(df, pch=21,main="Matriz de Dispersión, Histograma y Correlación")

textscatter <- function(df, mapping, ...) {
   ggplot(df, mapping, ...) + geom_text()
}

ggpairs(
  df, 
  title="Scatterplot de Variables",
  columns = c(3:8),
  mapping=ggplot2::aes(colour = año))

  lower = list(continuous = textscatter)

3 .- Time-Series

pas1.ts <- ts(df["zymotic"], start = c(1854, 6), frequency = 12)
str(pas1.ts)
##  Time-Series [1:22, 1] from 1854 to 1856: 0.388 12.499 27.376 26.015 16.415 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : NULL
##   ..$ : chr "zymotic"
pas1.ts
##             Jan        Feb        Mar        Apr        May        Jun
## 1854                                                         0.3882399
## 1855 85.2344642 68.5662538 40.0239147 14.7897805 14.3207510 20.6365952
## 1856  0.9499683  0.5519145  0.3250975                                 
##             Jul        Aug        Sep        Oct        Nov        Dec
## 1854 12.4991296 27.3755207 26.0151865 16.4148419 28.3831047 52.6251564
## 1855  8.9572537 10.8261981  3.9580323  2.7320072  4.7024014  2.1056529
## 1856
pas2.ts <- ts(df[c(4)], start = c(1854, 6), frequency = 12)
pas3.ts <- ts(df[c(5)], start = c(1854, 6), frequency = 12)
pas4.ts <- ts(df[c(6)], start = c(1854, 6), frequency = 12)
pas5.ts <- ts(df[c(7)], start = c(1854, 6), frequency = 12)
pas6.ts <- ts(df[c(8)], start = c(1854, 6), frequency = 12)
pass.ts <- ts(df[c(3:8)], start = c(1854, 6), frequency = 12)
str(pass.ts)
##  Time-Series [1:22, 1:6] from 1854 to 1856: 0.388 12.499 27.376 26.015 16.415 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : NULL
##   ..$ : chr [1:6] "zymotic" "heridas" "otras" "zymoticAÑO" ...
pass.ts
##             zymotic    heridas      otras zymoticAÑO heridasAÑO otrasAÑO
## Jun 1854  0.3882399 0.00000000  0.2117672        4.7        0.0      2.5
## Jul 1854 12.4991296 0.00000000  0.8007799      150.0        0.0      9.6
## Aug 1854 27.3755207 0.03306222  0.9918667      328.5        0.4     11.9
## Sep 1854 26.0151865 2.67414988  2.3109937      312.2       32.1     27.7
## Oct 1854 16.4148419 4.30767223  4.1771367      197.0       51.7     50.1
## Nov 1854 28.3831047 9.65160075  3.5647027      340.6      115.8     42.8
## Dec 1854 52.6251564 3.47783642  3.9964611      631.5       41.7     48.0
## Jan 1855 85.2344642 2.56228197 10.0021610     1022.8       30.7    120.0
## Feb 1855 68.5662538 1.35838805 11.6756687      822.8       16.3    140.1
## Mar 1855 40.0239147 1.06287574  5.7129571      480.3       12.8     68.6
## Apr 1855 14.7897805 1.48827980  1.7673323      177.5       17.9     21.2
## May 1855 14.3207510 1.38133228  1.0430468      171.8       16.6     12.5
## Jun 1855 20.6365952 5.37786584  0.7976739      247.6       64.5      9.6
## Jul 1855  8.9572537 3.14207330  0.7737942      107.5       37.7      9.3
## Aug 1855 10.8261981 3.67597615  0.5603622      129.9       44.1      6.7
## Sep 1855  3.9580323 5.77998367  0.4188394       47.5       69.4      5.0
## Oct 1855  2.7320072 1.13122172  0.3841885       32.8       13.6      4.6
## Nov 1855  4.7024014 0.87179352  0.8453755       56.4       10.5     10.1
## Dec 1855  2.1056529 0.41650277  0.6478932       25.3        5.0      7.8
## Jan 1856  0.9499683 0.04523659  1.0856781       11.4        0.5     13.0
## Feb 1856  0.5519145 0.00000000  0.4369323        6.6        0.0      5.2
## Mar 1856  0.3250975 0.00000000  0.7585609        3.9        0.0      9.1
#library(ggfortify)
#library(zoo)
autoplot(pass.ts)

plot(pass.ts)

autoplot(pas1.ts, ts.colour = "red", ts.linetype = "dashed")

# autoplot(stl(pas1.ts, s.window = "periodic"), ts.colour="blue")
autoplot(pacf(pas1.ts, plot = FALSE))

autoplot(acf(pas1.ts, plot = FALSE), conf.int.fill = "#0000FF", conf.int.value = 0.8,
         conf.int.type = "ma")

autoplot(spec.ar(pas1.ts, plot = FALSE))

autoplot(spec.ar(pas2.ts, plot = FALSE))

Periodograma acumulado

ggcpgram(arima.sim(list(ar = c(0.7, -0.5)), n = 50))

ggtsdiag debería generar el diagrama similar a tsdiag.

#library(forecast)
ggtsdiag(auto.arima(pas1.ts))

gglagplot(pas1.ts, lags = 4)

ggfreqplot(pas1.ts)

ggfreqplot(pas1.ts, freq = 4)

arima1<-forecast::auto.arima(pas1.ts)
forecast1<-forecast::forecast(arima1,level = c(95), h = 50)
autoplot(forecast1)

autoplot(forecast1, ts.colour = "firebrick1", predict.colour = "red",
         predict.linetype = "dashed", conf.int = FALSE)

## Season plots

forecast::ggseasonplot(pas1.ts, year.labels=TRUE, year.labels.left=TRUE)

forecast::ggseasonplot(pas1.ts, year.labels=TRUE, year.labels.left=TRUE, polar = TRUE)

3.1 Ruido Gaussiano linealmente dependiente en el tiempo

alpha <- 1
beta <- 0.1
t <- 1:22
mu <- alpha + beta*t

fit <- lm(pas1.ts ~ t)     #calcula la regresión  lm=modelo lineal
summary(fit)    #slow y el interceptro   y=ax+b
## 
## Call:
## lm(formula = pas1.ts ~ t)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -36.239  -6.964  -5.483  -1.099  59.620 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  38.2002     9.3660   4.079 0.000585 ***
## t            -1.5732     0.7131  -2.206 0.039237 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 21.22 on 20 degrees of freedom
## Multiple R-squared:  0.1957, Adjusted R-squared:  0.1555 
## F-statistic: 4.867 on 1 and 20 DF,  p-value: 0.03924
plot(fit)

dplyr::tibble(time = t, value = pas1.ts) %>%
  ggplot2::ggplot(ggplot2::aes(x = time, y = value)) +
  ggplot2::geom_line() +
  ggplot2::geom_abline(intercept = fit$coefficients[1], slope = fit$coefficients[2], col = "red")   #quiero calcular la lina roja con una regresión

# SOI= LA SERIE TEMPORAL
# REC= CANTIDAD DE PECES QUE SE HAN PESCADO
soi.lag6 <- xts::lag.xts(pas1.ts,6)  # desplazar una serie temporal 6 veces, con lo que x1 es igual a y7, x2 igual a y8, x3 igual a y9....
fit <- lm(pas2.ts ~ soi.lag6)    # esta es la relación que hace
summary(fit)
## 
## Call:
## lm(formula = pas2.ts ~ soi.lag6)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.4551 -1.0010 -0.5978  0.2863  3.2081 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  0.77900    0.59724   1.304   0.2132  
## soi.lag6     0.04480    0.01687   2.656   0.0188 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.55 on 14 degrees of freedom
##   (6 observations deleted due to missingness)
## Multiple R-squared:  0.335,  Adjusted R-squared:  0.2875 
## F-statistic: 7.053 on 1 and 14 DF,  p-value: 0.01882
dplyr::tibble(time = zoo::index(pas2.ts), 
              actual = zoo::coredata(pas2.ts),
              estimated = c(rep(0,6),fit$fitted.values)) %>%
  tidyr::gather(pas1.ts, Value, -time) %>%
  ggplot2::ggplot(ggplot2::aes(x = time, y = Value, col = pas1.ts)) +
  ggplot2::geom_line()

plot(fit)

# library(reshape2)
meltdf <- reshape2::melt(d,id="fecha")
ggplot(meltdf,aes(x=fecha,y=value,colour=variable,group=variable)) + geom_line()

3.2 tslm rewritten

# autoplot of a forecast object
fc <- forecast::forecast(pas1.ts)
autoplot(fc)

# autoplot of a forecast object
fc <- forecast::forecast(pass.ts)
autoplot(fc)

# Plotting the components of an ETS model
fit <- forecast::ets(pas1.ts)
autoplot(fit)

# Plotting the inverse characteristic roots of an ARIMA model
fit <- forecast::auto.arima(pas1.ts, D=1)
autoplot(fit)

ggtsdisplay(pas1.ts)

ggseasonplot(pas1.ts)

deaths.lm  <- tslm(pas1.ts ~ trend + fourier(pas1.ts,3))
mdeaths.fcast <- forecast(deaths.lm,
    data.frame(fourier(pas1.ts,3,36)))
autoplot(mdeaths.fcast)

deaths.lm  <- tslm(pas2.ts ~ trend + fourier(pas2.ts,3))
mdeaths.fcast <- forecast(deaths.lm,
    data.frame(fourier(pas2.ts,3,36)))
autoplot(mdeaths.fcast)

deaths.lm  <- tslm(pas3.ts ~ trend + fourier(pas3.ts,3))
mdeaths.fcast <- forecast(deaths.lm,
    data.frame(fourier(pas3.ts,3,36)))
autoplot(mdeaths.fcast)

deaths.lm  <- tslm(pas4.ts ~ trend + fourier(pas4.ts,3))
mdeaths.fcast <- forecast(deaths.lm,
    data.frame(fourier(pas4.ts,3,36)))
autoplot(mdeaths.fcast)

deaths.lm  <- tslm(pas5.ts ~ trend + fourier(pas5.ts,3))
mdeaths.fcast <- forecast(deaths.lm,
    data.frame(fourier(pas5.ts,3,36)))
autoplot(mdeaths.fcast)

deaths.lm  <- tslm(pas6.ts ~ trend + fourier(pas6.ts,3))
mdeaths.fcast <- forecast(deaths.lm,
    data.frame(fourier(pas6.ts,3,36)))
autoplot(mdeaths.fcast)

# left
autoplot(pass.ts)

# right
autoplot(pass.ts, facets = TRUE)

autoplot(pass.ts, facets = TRUE) +
  geom_smooth() +
  labs("Muertes y tasa",
       y = "Value(in thousands)",
       x = NULL)

ggseasonplot(pas1.ts, year.labels=FALSE, continuous=TRUE)

ggseasonplot(pas1.ts, year.labels=FALSE, continuous=TRUE, polar = TRUE)

# left: autoplot of the beer data
autoplot(pas1.ts)

# middle: lag plot of the beer data
gglagplot(pas1.ts)

# right: ACF plot of the beer data
ggAcf(pas1.ts)

#library(tidyquant)
df %>%
    ggplot(aes(x = mes, y = zymotic, group = año)) +
    geom_area(aes(fill = año), position = "stack") +
    labs(title = "Quantity Sold: Month Plot", x = "", y = "Sales",
         subtitle = "March through July tend to be most active") +
    scale_y_continuous() +
    theme_tq()

dygraph(pass.ts, main = "New Haven Temperatures") %>%
  dyRangeSelector()
dygraph(pass.ts, main="Muertes") %>%
  dySeries(label="valor por mil", color="black") %>%
  dyShading(from="1800-1-1", to="1855-1-1", color="#FFE6E6") %>%
  dyShading(from="1856-1-1", to="2000-1-1", color="#CCEBD6")

# Graficos

p <- ggplot(d, aes(x = zymotic, y=heridas, size = tamaño)) +
  geom_point(show.legend = FALSE, alpha = 0.7) +
  scale_color_viridis_d() +
  scale_size(range = c(2, 12)) +
  scale_x_log10() +
  labs(x = "zymotic", y = "heridas100año")
p